In [1]:
import pandas as pd
import re
import numpy as np
import warnings
import datetime

Notes Chunk

To Do List:

  • Add fixturing method using graph processing

    • Learn how to do this first
  • Add option to use one of the following fixturing methods:

    • Graph processing

    • Method based on elo relative to mean - statistical method?

  • Add caching

    • Needs to:

      • Check for the existence of a cache

      • If cache doesn't exist, create it

      • If cache does exist:

        • Check if the result is in the cache

        • If the result is in the cache, retrieve the result and don't process the game "normally"

    • Caching might have issues with teams playing games out of order?

    • Might need to write team elos and fixturing outcomes to the results sheet at the time of fixturing?

      • Maybe this doesn't matter?

In [2]:
#################################### OPTIONS CHUNK ####################################
#################################### OPTIONS CHUNK ####################################
#################################### OPTIONS CHUNK ####################################
#################################### OPTIONS CHUNK ####################################
#################################### OPTIONS CHUNK ####################################

# Usecache - process results and cache them
#    Caching needs to be properly implemented
usecache = False

# Process results - probably don't touch this?
processresults = True

# Warnings from pandas keep appearing
# Warning me that value is trying to be set on a copy of a slice from a DataFrame
# Not sure why the warnings are being passed 
suppress_warnings = True

# Dead teams - the teams that do not exist any more - add them to the list
deadteams = ['The Swanbourne Supremacy','']

# Fixture teams - only mark false if you don't want to fixture the teams
fixtureteams = True

# Lots of text output if this is True - mainly for monitoring and testing
# Reporttime for timing - you can say verbose = False and reporttime = True if you only want to see the
# runtime of the script
verbose = True
reporttime = True

In [3]:
# Misc pre-work
if suppress_warnings:
    warnings.filterwarnings('ignore')

# Timing stuff    
starttime = datetime.datetime.now()
resultsfinish = False
fixturingfinish = False

In [4]:
mens_mixed_url = 'https://docs.google.com/spreadsheets/d/15TNjIJDkopneZ6PWg4IGhTQNvTl-T5U8u_i9zgJPEXA/export?format=csv&id=15TNjIJDkopneZ6PWg4IGhTQNvTl-T5U8u_i9zgJPEXA&gid=0'
elos_workbook = 'https://raw.githubusercontent.com/robfox92/HockeyElo/master/Elos_2016b_week_1.csv'
requests_sheet_url = 'https://docs.google.com/spreadsheets/d/15TNjIJDkopneZ6PWg4IGhTQNvTl-T5U8u_i9zgJPEXA/export?format=csv&id=15TNjIJDkopneZ6PWg4IGhTQNvTl-T5U8u_i9zgJPEXA&gid=2236421'

if usecache:
    cached_games=pd.read_csv('processed results.csv')
    
    
mens_mixed_raw = pd.read_csv(mens_mixed_url)
elos_start_raw = pd.read_csv(elos_workbook)
games_requested = pd.read_csv(requests_sheet_url)

In [5]:
def removepunct(str_in):
    for n in [".",",","!","'",'"','\n','-']:
        str_in = str_in.replace(n,'')
    
    str_out = str_in.lower()
    return unicode(str_out)

def parse_round(x):
    y = None
    parsed = None
    if type(x) == str:
        parsed = re.search(r'([0-9]+)+',x).group(1)
        y = int(parsed)
    if (type(x) == int) or (type(x) == float):
        y = int(x)
    return y

def getKfactor(x):
        newteam = 'New Team'
        newteamK = 75
        oldteamK = 50
        if x == newteam:
            out = newteamK
        else:
            out = oldteamK
        return out

In [6]:
# Create game codes
mens_mixed_raw['Game Code'] = mens_mixed_raw['HOME'] + " vs " + mens_mixed_raw['AWAY']
playedgames = mens_mixed_raw['Game Code'].unique()
hometeams = mens_mixed_raw['HOME'].apply(removepunct).unique()

In [7]:
# Only get results that are validated
results = mens_mixed_raw[mens_mixed_raw['Validate'] == 'Y']
results['Round_number'] = results['ROUND'].apply(parse_round)
results['HOME'] = results['HOME'].apply(removepunct)
results['AWAY'] = results['AWAY'].apply(removepunct)
results['Game Code'] = results['Game Code'].apply(removepunct)
results.reset_index(inplace=True)

In [8]:
team_elos = elos_start_raw
team_elos['2016b Teams lower'] = elos_start_raw['2016b Teams'].apply(removepunct)
elos_dict = dict(zip(team_elos['2016b Teams lower'],team_elos['Starting Elo']))
team_elos['K Factor'] = team_elos['New Team'].apply(getKfactor)
team_K_factors = dict(zip(team_elos['2016b Teams lower'],team_elos['K Factor']))

In [9]:
results['Home Elo'] = None
results['Away Elo'] = None
elos_dict = dict(zip(team_elos['2016b Teams lower'],team_elos['Starting Elo']))

In [10]:
if usecache:
    processedgames = cached_games['Game Code'].unique()

In [11]:
if verbose:
    print "Beginning processing of results"
    if usecache:
        print "Using cache to avoid repeating result processing"
    if not usecache:
        print "Not using cache - this may slow down the processing"
#%%timeit -n 1 -r 1
team_elos_df = pd.DataFrame.from_dict(elos_dict,orient='index')
if processresults:
    for row in range(0,len(results)):
        if row%50 == 0 and row != 0 and verbose: print row,'results processed'
        # Get the home and away teams
        hometeam = results['HOME'][row]
        awayteam = results['AWAY'][row]
        
        # Check if the game has been processed previously
        if usecache:
            gamecode = results.loc[row,'Game Code']
            if gamecode in processedgames:
            # Grab the row from the cache
                ref = cached_games[cached_games['Game Code'] == gamecode].index.tolist()
                results.loc[row,] = cached_games.loc[ref,]
            
        else:
            # Get the elos from the elo dictionary
            awayElo = elos_dict[awayteam]
            homeElo = elos_dict[hometeam]
            # Write to the df
            results.loc[row,'Home Elo'] = homeElo
            results.loc[row,'Away Elo'] = awayElo
                
            roundnumber = results.loc[row,'Round_number']
            homeK = 50
            awayK = 50
            if type(roundnumber) != str and roundnumber < 4:
                # Get the team K factors
                homeK = team_K_factors[hometeam]
                awayK = team_K_factors[awayteam]
            
    
            
            
            
            # Calculate the Score Expectancies
            homeSE = 1 / (1 + 10 ** -((homeElo - awayElo) / 400))
            awaySE = 1 / (1 + 10 ** ((homeElo - awayElo) / 400))
            # Write to DF
            results.loc[row,'Home Predicted Result'] = homeSE
            results.loc[row,'Away Predicted Result'] = awaySE
        
        
            # Get home, away and total scores
            homescore = results['SCORE'][row]
            awayscore = results['SCORE.1'][row]
            totalscore = homescore+awayscore
        
            # Calculate home and away score percentages
            homeScorePerc = homescore / totalscore
            awayScorePerc = awayscore / totalscore
            # Write to DF
            results.loc[row,'Home Actual Result'] = homeScorePerc
            results.loc[row,'Away Actual Result'] = awayScorePerc

            # Find Elo Changes
            homeNewElo = homeElo + homeK * (homeScorePerc - homeSE)
            awayNewElo = awayElo + awayK * (awayScorePerc - awaySE)

            # Check to ensure winning teams don't lose Elo
            if homescore > awayscore:
                homeNewElo = max(homeNewElo,homeElo)
            if awayscore > homescore:
                awayNewElo = max(awayNewElo,awayElo)

            newElos = {hometeam:homeNewElo, awayteam:awayNewElo}

            elos_dict.update(newElos)
            
    resultsfinish = datetime.datetime.now()
    
if verbose:print '\n',len(results),'total games processed\n\nProcessing complete'


Beginning processing of results
Not using cache - this may slow down the processing
50 results processed
100 results processed
150 results processed
200 results processed
250 results processed
300 results processed
350 results processed
400 results processed
450 results processed
500 results processed
550 results processed

565 total games processed
 Processing complete

In [12]:
elos_df = pd.DataFrame(elos_dict.items())
elos_df.columns = ['Team','Elo']
mean_elo = np.mean(elos_df['Elo'])
elos_df['Relative Elo'] = abs(elos_df['Elo']-mean_elo)
elos_df.sort_values(by='Relative Elo',ascending=False,inplace=True)
elos_df.reset_index(inplace=True)
del elos_df['index']
elos_df['Relative to Mean'] = (elos_df['Elo']-mean_elo)/elos_df['Relative Elo']

games_df = pd.DataFrame()
games_df['Team A'] = None
games_df['Team B'] = None
games_df.head();

In [13]:
allteams = elos_df['Team'].unique()
deadteams = set(deadteams)
for team in deadteams:
    oldname = team
    newname = removepunct(team)
    deadteams = deadteams - {oldname}
    deadteams = deadteams | {newname}

currentteams = set(allteams) - deadteams

In [14]:
elos_df_allteams = elos_df
elos_df = elos_df.ix[elos_df['Team'].isin(currentteams)]
elos_df.reset_index(inplace=True)
del elos_df['index']

In [15]:
if fixtureteams:
    if verbose:print "Beginning fixturing of teams"
#    %%timeit -n 1 -r 1
    byeteam = ''
    if len(elos_df)%2 == 1:
        byeteam = removepunct(raw_input('Enter the Bye Team:\n'))
    
    
    games_df = pd.DataFrame()
    games_df['Team A'] = None
    games_df['Team B'] = None
    
    # Process the requested games here
    
    games_requested['Team 1'] = games_requested['Team 1'].apply(removepunct)
    games_requested['Team 2'] = games_requested['Team 2'].apply(removepunct)
    games_requested['Game Code 1'] = games_requested['Team 1'] + " vs " + games_requested['Team 2']
    games_requested['Game Code 2'] = games_requested['Team 2'] + " vs " + games_requested['Team 1']
    
    for row in range(0,len(games_requested)):
        team1 = games_requested.loc[row,'Team 1']
        team2 = games_requested.loc[row,'Team 2']
        code1 = games_requested.loc[row,'Game Code 1']
        code2 = games_requested.loc[row,'Game Code 2']
        
        # Check if the teams have been fixtured this round
        # Check if the teams have played each other yet
        # Check if either team has been fixtured to have the bye this week
        if (
            team1 not in games_df['Team A'].unique() and team1 not in games_df['Team B'].unique() and
            team2 not in games_df['Team A'].unique() and team2 not in games_df['Team B'].unique() and
            code1 not in playedgames and 
            code2 not in playedgames and
            team1 != byeteam and 
            team2 != byeteam
            ):
                relevantrow = len(games_df)
                games_df.loc[relevantrow,'Team A'] = team1
                games_df.loc[relevantrow,'Team B'] = team2
                games_df.loc[relevantrow,'A Elo'] = elos_dict[team1]
                games_df.loc[relevantrow,'B Elo'] = elos_dict[team2]
    
    
    # Fixture the rest of the games based on relative elo
    #    Teams further from the mean elo are fixtured before teams closer to the mean elo
    
    for row in range(0,len(elos_df)):
        
        # Grab the team name, check to see if they've been fixtured yet
        team = elos_df.loc[row,'Team']
        team_fixtured = (team in games_df['Team A'].unique()) or (team in games_df['Team B'].unique()) or (team == byeteam)
        # Grab the elo and their position relative to the mean
        team_elo = elos_df.loc[row,'Elo']
        rel = elos_df.loc[row,'Relative to Mean']
        if rel == 1:
            team_below_mean = False
        if rel == -1:
            team_below_mean = True
        #else:
            #print "Error in calculating team's elo relative to the mean\nPlease check the team entry in elos_df"
            #team_below_mean = None
        # If they haven't, fixture them to the nearest possible game
        if not team_fixtured:
            elos_df_by_elo = elos_df.sort_values(by='Elo',ascending = team_below_mean)
            
            game_fixtured = False
            roww = 0
            while (not game_fixtured and roww < len(elos_df)-1):
                # Grab a potential team
                possibleteam = elos_df_by_elo.loc[roww,'Team']
                # Check to see if they are the same, and that possible team has not been fixtured already
                if (possibleteam!=team and
                    (possibleteam not in games_df['Team A'].unique()) and
                    (possibleteam not in games_df['Team B'].unique()) and
                    (possibleteam != byeteam) and
                    (abs(elos_dict[team]-elos_dict[possibleteam])<100)
                   ):
                    # Generate two game codes
                    gamecode1 = possibleteam + " vs " + team
                    gamecode2 = team + " vs " + possibleteam
                    
                    if gamecode1 not in playedgames and gamecode2 not in playedgames:
                        relevantrow = len(games_df)
                        games_df.loc[relevantrow,'Team A'] = team
                        games_df.loc[relevantrow,'Team B'] = possibleteam
                        games_df.loc[relevantrow,'A Elo'] = elos_dict[team]
                        games_df.loc[relevantrow,'B Elo'] = elos_dict[possibleteam]        
                        
                        game_fixtured = True
                    else: roww = roww+1
                else: roww = roww+1
    
    
                    
    for team in currentteams:
        teamInA = team in games_df['Team A'].unique()
        teamInB = team in games_df['Team B'].unique()
        teamfixtured = teamInA or teamInB
        if not teamfixtured and verbose:
            if team != byeteam:
                print team,'has not been fixtured'
            elif team == byeteam:
                print team,'was the bye team this round and has not been fixtured'
                    
    games_df.reset_index(inplace = True)
    del games_df['index']
    if verbose:print "\n\nFixturing complete"


Beginning fixturing of teams


Fixturing complete

In [16]:
if fixtureteams:
    games_df['Elo Diff'] = games_df['A Elo'] - games_df['B Elo']

In [17]:
if fixtureteams:
#    %%timeit -n 1 -r 1
    # Work out who is playing at home
    for row in range(0,len(games_df)):
        teamA = games_df.loc[row,'Team A']
        teamB = games_df.loc[row,'Team B']
        teamAhome = len(hometeams[hometeams == teamA])
        teamBhome = len(hometeams[hometeams == teamB])
        
        if teamAhome > teamBhome:
            games_df.loc[row,'Home'] = teamB
            games_df.loc[row,'Away'] = teamA
        else:
            games_df.loc[row,'Home'] = teamA
            games_df.loc[row,'Away'] = teamB
        
        games_df.loc[row,'Game Code'] = games_df.loc[row,'Home'] + " vs " + games_df.loc[row,'Away']
        #print games_df.loc[row,'Game Code']
        
    fixturingfinish = datetime.datetime.now()

In [18]:
if verbose or reporttime:
    if resultsfinish:
        resultstime = resultsfinish - starttime
        print "It took",resultstime,'to process the results'
    if fixturingfinish and resultsfinish:
        fixturetime = fixturingfinish - resultsfinish
        print "It took",fixturetime,'to fixture the teams'
    if fixturingfinish:
        totaltime = fixturingfinish  - starttime
        print "It took",totaltime,'in total to run this script'


It took 0:01:53.449000 to process the results
It took 0:00:01.518000 to fixture the teams
It took 0:01:54.967000 in total to run this script

In [ ]:


In [ ]: